<!DOCTYPE html>



  


<html class="theme-next gemini use-motion" lang="en">
<head>
  <!-- Core document metadata: encoding, IE rendering mode, viewport, browser UI color. -->
  <meta charset="UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<!-- No maximum-scale here: capping the scale prevents users from pinch-zooming the page. -->
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="theme-color" content="#222">
<!-- Pace progress-bar library and its "flash" theme, loaded from a third-party CDN.
     The scheme is spelled out as https: so the assets do not inherit the page's protocol.
     NOTE(review): pace 1.0.2 is loaded a second time from /lib/pace/ further down in this head,
     and these tags carry no integrity hash; consider keeping a single self-hosted copy. -->
<script src="https://cdn.bootcss.com/pace/1.0.2/pace.min.js"></script>
<link href="https://cdn.bootcss.com/pace/1.0.2/themes/pink/pace-theme-flash.css" rel="stylesheet">
<style>
    /* Color overrides for the .pace-* classes (progress bar, its inner glow, activity indicator). */
    .pace .pace-progress {
        background: #000000; /* progress bar color */
        height: 3px;
    }
    .pace .pace-progress-inner {
         box-shadow: 0 0 10px #1E92FB, 0 0 5px     #1E92FB; /* shadow color */
    }
    .pace .pace-activity {
        border-top-color: #808080;    /* top border color */
        border-left-color: #808080;    /* left border color */
    }
</style>



  
  
    
    
  <script src="/lib/pace/pace.min.js?v=1.0.2"></script>
  <link href="/lib/pace/pace-theme-minimal.min.css?v=1.0.2" rel="stylesheet">







<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />
















  
  
  <link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css" />







<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css" />

<link href="/css/main.css?v=5.1.4" rel="stylesheet" type="text/css" />


  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">





  <meta name="keywords" content="CTF," />





  <!-- Feed autodiscovery. NOTE(review): assumes /rss2.xml is an RSS 2.0 document, as its name
       suggests, so it is advertised as RSS rather than Atom; confirm against the generated feed. -->
  <link rel="alternate" href="/rss2.xml" title="diao-diao-UPUP" type="application/rss+xml" />






<meta name="description" content="XPath之前实现了最基本的爬虫，使用正则表达式来获取页面，但是正则表达式在遇到一些复杂问题的时候就会变得繁琐，有一个地方写错了就有可能导致匹配失败，所以引入解析库来帮助获取 XPath是XML路径语言，但是同时也适用于HTML文档的搜索 常用规则    表达式 描述      nodename 选取此节点的所有子节点    &#x2F; 从当前节点选取直接子节点   &#x2F;&#x2F; 从当前节点选取子孙节点   .">
<meta name="keywords" content="CTF">
<meta property="og:type" content="article">
<meta property="og:title" content="Python3-2020&#x2F;3&#x2F;27-周报5">
<meta property="og:url" content="http:&#x2F;&#x2F;yoursite.com&#x2F;2020&#x2F;03&#x2F;27&#x2F;Python3-2020-3-27-%E5%91%A8%E6%8A%A55&#x2F;index.html">
<meta property="og:site_name" content="diao-diao-UPUP">
<meta property="og:description" content="XPath之前实现了最基本的爬虫，使用正则表达式来获取页面，但是正则表达式在遇到一些复杂问题的时候就会变得繁琐，有一个地方写错了就有可能导致匹配失败，所以引入解析库来帮助获取 XPath是XML路径语言，但是同时也适用于HTML文档的搜索 常用规则    表达式 描述      nodename 选取此节点的所有子节点    &#x2F; 从当前节点选取直接子节点   &#x2F;&#x2F; 从当前节点选取子孙节点   .">
<meta property="og:locale" content="en">
<meta property="og:updated_time" content="2020-03-27T06:05:23.511Z">
<meta name="twitter:card" content="summary">



<script type="text/javascript" id="hexo.configurations">
  // Inline page configuration: declares two globals, NexT and CONFIG.
  // Presumably read by theme scripts loaded elsewhere in the page (not visible in this chunk).

  // Reuse window.NexT if it already exists; otherwise start from an empty object.
  var NexT = window.NexT || {};
  // Settings literal; every value below is a constant baked into this generated page.
  var CONFIG = {
    root: '/',
    scheme: 'Gemini',
    version: '5.1.4',
    sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":false,"onmobile":false},
    fancybox: true,
    tabs: true,
    // Per-region transition names used when motion is enabled.
    motion: {"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
    duoshuo: {
      userId: '0',
      author: 'Author'
    },
    // applicationID, apiKey and indexName are empty strings in this build.
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: {"per_page":10},
      // The ${...} tokens sit inside ordinary double-quoted strings, so they are literal text here;
      // presumably substituted later by the search script (verify).
      labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
    }
  };
</script>



  <link rel="canonical" href="http://yoursite.com/2020/03/27/Python3-2020-3-27-周报5/"/>





  <title>Python3-2020/3/27-周报5 | diao-diao-UPUP</title>
  








</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="en">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/"  class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">diao-diao-UPUP</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <p class="site-subtitle">我是一只web狗</p>
      
  </div>

  <div class="site-nav-toggle">
    <!-- Text-less toggle button: an explicit type avoids the default "submit" behavior, and
         aria-label gives the control an accessible name for assistive technology. -->
    <button type="button" aria-label="Toggle navigation bar">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      
        
        <li class="menu-item menu-item-home">
          <a href="/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-question-circle"></i> <br />
            
            Home
          </a>
        </li>
      
        
        <li class="menu-item menu-item-about">
          <a href="/about" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-question-circle"></i> <br />
            
            About
          </a>
        </li>
      
        
        <li class="menu-item menu-item-tags">
          <a href="/tags" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-question-circle"></i> <br />
            
            Tags
          </a>
        </li>
      
        
        <li class="menu-item menu-item-categories">
          <a href="/categories" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-question-circle"></i> <br />
            
            Categories
          </a>
        </li>
      

      
        <li class="menu-item menu-item-search">
          <!-- Search trigger (class popup-trigger). The href is a javascript: no-op, not a
               destination, so it carries no target="_blank" that could open an empty new tab. -->
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br />
            
            Search
          </a>
        </li>
      
    </ul>
  

  
    <div class="site-search">
      
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <!-- A placeholder is not a label; aria-label gives the field an accessible name. -->
      <input autocomplete="off"
             placeholder="Searching..." spellcheck="false"
             type="text" id="local-search-input" aria-label="Search">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://yoursite.com/2020/03/27/Python3-2020-3-27-%E5%91%A8%E6%8A%A55/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="51nd0re1">
      <meta itemprop="description" content="">
      <meta itemprop="image" content="/images/header.jpg">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="diao-diao-UPUP">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">Python3-2020/3/27-周报5</h1>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">Posted on</span>
              
              <time title="Post created" itemprop="dateCreated datePublished" datetime="2020-03-27T11:16:26+08:00">
                2020-03-27
              </time>
            

            

            
          </span>

          
            <span class="post-category" >
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">In</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/%E6%97%A5%E5%B8%B8%E5%AD%A6%E4%B9%A0/" itemprop="url" rel="index">
                    <span itemprop="name">日常学习</span>
                  </a>
                </span>

                
                
                  , 
                
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/%E6%97%A5%E5%B8%B8%E5%AD%A6%E4%B9%A0/Python3/" itemprop="url" rel="index">
                    <span itemprop="name">Python3</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
          

          
          

          

          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <h1 id="XPath"><a href="#XPath" class="headerlink" title="XPath"></a>XPath</h1><p>之前实现了最基本的爬虫，使用正则表达式来获取页面，但是正则表达式在遇到一些复杂问题的时候就会变得繁琐，有一个地方写错了就有可能导致匹配失败，所以引入解析库来帮助获取</p>
<p>XPath是XML路径语言，但是同时也适用于HTML文档的搜索</p>
<h2 id="常用规则"><a href="#常用规则" class="headerlink" title="常用规则"></a>常用规则</h2><div class="table-container">
<table>
<thead>
<tr>
<th>表达式</th>
<th>描述 </th>
</tr>
</thead>
<tbody>
<tr>
<td>nodename</td>
<td>选取此节点的所有子节点 </td>
</tr>
<tr>
<td>/</td>
<td>从当前节点选取直接子节点</td>
</tr>
<tr>
<td>//</td>
<td>从当前节点选取子孙节点</td>
</tr>
<tr>
<td>.</td>
<td>选取当前节点</td>
</tr>
<tr>
<td>..</td>
<td>选取当前节点的父节点</td>
</tr>
<tr>
<td>@</td>
<td>选取属性</td>
</tr>
</tbody>
</table>
</div>
<p>常用匹配规则：<br><figure class="highlight autoit"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">//title[<span class="symbol">@lang</span>=<span class="string">'eng'</span>]</span><br></pre></td></tr></table></figure><br>它代表选择所有名称为title,同时属性lang的值为eng的节点</p>
<h2 id="实例"><a href="#实例" class="headerlink" title="实例"></a>实例</h2><figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br></pre></td><td class="code"><pre><span class="line">from lxml import etree</span><br><span class="line"></span><br><span class="line">text='''</span><br><span class="line"><span class="tag">&lt;<span class="name">div</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">ul</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">li</span> <span class="attr">class</span>=<span class="string">""</span><span class="attr">item-0</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link1.html"</span>&gt;</span>first item<span class="tag">&lt;/<span class="name">a</span>&gt;</span><span class="tag">&lt;/<span class="name">li</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">li</span> <span class="attr">class</span>=<span class="string">""</span><span class="attr">item-1</span>&gt;</span><span 
class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link2.html"</span>&gt;</span>second item<span class="tag">&lt;/<span class="name">a</span>&gt;</span><span class="tag">&lt;/<span class="name">li</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">li</span> <span class="attr">class</span>=<span class="string">""</span><span class="attr">item-2</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link3.html"</span>&gt;</span>third item<span class="tag">&lt;/<span class="name">a</span>&gt;</span><span class="tag">&lt;/<span class="name">li</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">li</span> <span class="attr">class</span>=<span class="string">""</span><span class="attr">item-3</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link4.html"</span>&gt;</span>fourth item<span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">ul</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line">'''</span><br><span class="line"></span><br><span class="line">html=etree.HTML(text)</span><br><span class="line">result=etree.tostring(html)</span><br><span class="line">print(result.decode('utf-8'))</span><br><span class="line"></span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span><span class="tag">&lt;<span class="name">body</span>&gt;</span><span class="tag">&lt;<span class="name">div</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">ul</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span 
class="name">li</span> <span class="attr">class</span>=<span class="string">""</span> <span class="attr">item-0</span>=<span class="string">""</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link1.html"</span>&gt;</span>first item<span class="tag">&lt;/<span class="name">a</span>&gt;</span><span class="tag">&lt;/<span class="name">li</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">li</span> <span class="attr">class</span>=<span class="string">""</span> <span class="attr">item-1</span>=<span class="string">""</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link2.html"</span>&gt;</span>second item<span class="tag">&lt;/<span class="name">a</span>&gt;</span><span class="tag">&lt;/<span class="name">li</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">li</span> <span class="attr">class</span>=<span class="string">""</span> <span class="attr">item-2</span>=<span class="string">""</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link3.html"</span>&gt;</span>third item<span class="tag">&lt;/<span class="name">a</span>&gt;</span><span class="tag">&lt;/<span class="name">li</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">li</span> <span class="attr">class</span>=<span class="string">""</span> <span class="attr">item-3</span>=<span class="string">""</span>&gt;</span><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"link4.html"</span>&gt;</span>fourth item<span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">li</span>&gt;</span><span class="tag">&lt;/<span class="name">ul</span>&gt;</span></span><br><span 
class="line"><span class="tag">&lt;/<span class="name">div</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">body</span>&gt;</span><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p>首先声明了一段HTML文本，调用HTML类进行初始化，这里就成功构造了一个XPath解析对象</p>
<p>可以看到缺一个li标签的结尾，但是etree模块可以自动修正HTML文本</p>
<p>再调用tostring()方法即可输出修正后的结果，但是结果是bytes类型，使用decode()方法转成str类型</p>
<p>li标签补全了，还自动添加了body html节点</p>
<h2 id="所有节点"><a href="#所有节点" class="headerlink" title="所有节点"></a>所有节点</h2><p>我们一般会用//开头的XPath规则来选取所有符合要求的节点<br><figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//*')</span><br><span class="line"><span class="builtin-name">print</span>(result)</span><br></pre></td></tr></table></figure><br>这里使用*代表匹配所有节点，也就是HTML文本中的所有节点都会被获取，返回结果是一个列表，每个元素是Element类型，其后跟着节点名称，如html,body,div,ul,li,a等</p>
<p>如果只想获取li标签<br><figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//li')</span><br><span class="line"><span class="builtin-name">print</span>(result)</span><br><span class="line"><span class="builtin-name">print</span>(result[0])</span><br></pre></td></tr></table></figure></p>
<p>想取出某个元素，使用[number]就行</p>
<h2 id="子节点"><a href="#子节点" class="headerlink" title="子节点"></a>子节点</h2><p>我们可以通过/或//查找元素的子节点或子孙节点<br><figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//li/a')</span><br><span class="line"><span class="builtin-name">print</span>(result)</span><br></pre></td></tr></table></figure><br>这样就获取了li节点下的所有a子节点</p>
<p>此处的/用于选取直接子节点，如果想要获取所有子孙节点，就可以使用//<br><figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//li//a')</span><br><span class="line"><span class="builtin-name">print</span>(result)</span><br></pre></td></tr></table></figure><br>这样就获取了li节点下的所有a节点，输出结果一样</p>
<p>但是如果使用<code>//li/a</code>这样的情况，如果li下没有直接的a子节点，那么就无法获取任何匹配结果</p>
<h2 id="父节点"><a href="#父节点" class="headerlink" title="父节点"></a>父节点</h2><p>寻找父节点的方式与返回上级目录的方式基本一样，使用..来返回</p>
<figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//a[@<span class="attribute">href</span>=<span class="string">"link4.html"</span>]/<span class="built_in">..</span>/@class<span class="string">')</span></span><br><span class="line"><span class="string">print(result)</span></span><br><span class="line"><span class="string"></span></span><br><span class="line"><span class="string">输出结果：</span></span><br><span class="line"><span class="string">['</span>item-1<span class="string">']</span></span><br></pre></td></tr></table></figure>
<p>这里可以看到写法基本和目录路径一致：先通过属性定位想要查找的a节点，之后跟上..来获取它的父节点</p>
<figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//a[@<span class="attribute">href</span>=<span class="string">"link4.html"</span>]/parent::*/@class<span class="string">')</span></span><br><span class="line"><span class="string">print(result)</span></span><br><span class="line"><span class="string"></span></span><br><span class="line"><span class="string">输出结果：</span></span><br><span class="line"><span class="string">['</span>item-1<span class="string">']</span></span><br></pre></td></tr></table></figure>
<p>也可以使用parent::来获取父节点</p>
<h2 id="属性匹配"><a href="#属性匹配" class="headerlink" title="属性匹配"></a>属性匹配</h2><p>我们可以使用@符号来进行属性过滤<br><figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//li[@<span class="attribute">class</span>=<span class="string">"item-0"</span>]<span class="string">')</span></span><br><span class="line"><span class="string">print(result)</span></span><br></pre></td></tr></table></figure></p>
<p>这里使用了@符号限定了class的值为item-0，所以会返回class属性为item-0的所有li节点</p>
<h2 id="文本获取"><a href="#文本获取" class="headerlink" title="文本获取"></a>文本获取</h2><p>我们可以使用text()方法来获取节点中的文本<br><figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//li[@<span class="attribute">class</span>=<span class="string">"item-0"</span>]/text()<span class="string">')</span></span><br><span class="line"><span class="string">print(result)</span></span><br></pre></td></tr></table></figure></p>
<p>这种方法不太推荐，是个错误示范，返回结果可能不会有，因为/在XPath中的意思是匹配直接子节点，如果没有直接子节点，就不会返回</p>
<p>所以可以使用//或者先访问直接子节点a再text()</p>
<figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//li[@<span class="attribute">class</span>=<span class="string">"item-0"</span>]//text()<span class="string">')</span></span><br><span class="line"><span class="string">print(result)</span></span><br></pre></td></tr></table></figure>
<figure class="highlight routeros"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml import etree</span><br><span class="line"></span><br><span class="line"><span class="attribute">html</span>=etree.parse('./test.html',etree.HTMLParser())</span><br><span class="line"><span class="attribute">result</span>=html.xpath('//li[@<span class="attribute">class</span>=<span class="string">"item-0"</span>]/a/text()<span class="string">')</span></span><br><span class="line"><span class="string">print(result)</span></span><br></pre></td></tr></table></figure>
<h2 id="属性获取"><a href="#属性获取" class="headerlink" title="属性获取"></a>属性获取</h2><p>我们知道使用text()可以获取节点内所有文本，节点属性我们可以使用@符号</p>
<figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br></pre></td><td class="code"><pre><span class="line">from lxml import etree</span><br><span class="line"></span><br><span class="line">html=etree.parse(<span class="string">'./test.html'</span>,etree.HTMLParser())</span><br><span class="line">result=<span class="selector-tag">html</span>.xpath(<span class="string">'//li/a/@href'</span>)</span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(result)</span></span></span><br></pre></td></tr></table></figure>
<p>获取所有li节点下所有a节点的href属性</p>
<h2 id="属性多值匹配"><a href="#属性多值匹配" class="headerlink" title="属性多值匹配"></a>属性多值匹配</h2><p>有时候一个属性可能有多个值<br><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml <span class="keyword">import</span> etree</span><br><span class="line"></span><br><span class="line">text=<span class="string">'''</span></span><br><span class="line"><span class="string">&lt;li class="li li-first"&gt;&lt;a href="link.html"&gt;first item&lt;/a&gt;&lt;/li&gt;</span></span><br><span class="line"><span class="string">'''</span></span><br><span class="line">html=etree.HTML(text)</span><br><span class="line">result=html.xpath(<span class="string">'//li[@class="li"]/a/text()'</span>)</span><br><span class="line">print(result)</span><br></pre></td></tr></table></figure><br>这个例子中li有两个属性li和li-first，使用之前的方法就无法匹配了，这样就引入我们的contains()方法，第一个参数传入属性名称，第二个参数传入属性值</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml <span class="keyword">import</span> etree</span><br><span class="line"></span><br><span class="line">text=<span class="string">'''</span></span><br><span class="line"><span class="string">&lt;li class="li li-first"&gt;&lt;a href="link.html"&gt;first item&lt;/a&gt;&lt;/li&gt;</span></span><br><span class="line"><span class="string">'''</span></span><br><span class="line">html=etree.HTML(text)</span><br><span class="line">result=html.xpath(<span class="string">'//li[contains(@class,"li")]/a/text()'</span>)</span><br><span class="line">print(result)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line">first item</span><br></pre></td></tr></table></figure>
<p>这样就能取出文本内容了</p>
<h2 id="多属性匹配"><a href="#多属性匹配" class="headerlink" title="多属性匹配"></a>多属性匹配</h2><p>还要一种情况就是多个属性确定一个节点，这个时候就需要使用and运算符来连接<br><figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">from</span> lxml <span class="keyword">import</span> etree</span><br><span class="line"></span><br><span class="line">text=<span class="string">'''</span></span><br><span class="line"><span class="string">&lt;li class="li li-first" name="item"&gt;&lt;a href="link.html"&gt;first item&lt;/a&gt;&lt;/li&gt;</span></span><br><span class="line"><span class="string">'''</span></span><br><span class="line">html=etree.HTML(text)</span><br><span class="line">result=html.xpath(<span class="string">'//li[contains(@class,"li") and @name="item"]/a/text()'</span>)</span><br><span class="line">print(result)</span><br></pre></td></tr></table></figure><br>这里li又增加了一个属性name。要确定就需要同时根据class和name属性来选择，一个条件是class中的li字符串，一个是name属性为item字符串</p>
<p>除了and，XPath还有很多运算符</p>
<div class="table-container">
<table>
<thead>
<tr>
<th>运算符</th>
<th>描述</th>
<th>实例</th>
<th>返回值 </th>
</tr>
</thead>
<tbody>
<tr>
<td>or</td>
<td>或</td>
<td>age=19 or age=20</td>
<td>如果age=19，则返回true。如果是age=21，则返回false</td>
</tr>
<tr>
<td>and</td>
<td>与</td>
<td>age&gt;19 and age&lt;21</td>
<td>如果age=20，则返回true</td>
</tr>
<tr>
<td>mod</td>
<td>计算除法的余数</td>
<td>5 mod 2</td>
<td>1</td>
</tr>
<tr>
<td>+</td>
<td>加法</td>
<td>6+4</td>
<td>10</td>
</tr>
<tr>
<td>-</td>
<td>减法</td>
<td>6-4</td>
<td>2</td>
</tr>
<tr>
<td>*</td>
<td>乘法</td>
<td>6*4</td>
<td>24</td>
</tr>
<tr>
<td>div</td>
<td>除法</td>
<td>8 div 4</td>
<td>2</td>
</tr>
<tr>
<td>=</td>
<td>等于</td>
<td>age=19</td>
<td>age=19，则返回true</td>
</tr>
</tbody>
</table>
</div>
<p>大于小于，小于（大于）等于，不等于用法与以往相同 不多介绍</p>
<h1 id="Beautiful-Soup"><a href="#Beautiful-Soup" class="headerlink" title="Beautiful Soup"></a>Beautiful Soup</h1><p>Beautiful Soup和XPath类似，也是一个解析库，但是相比较XPath更加方便快捷</p>
<p>Beautiful Soup会自动将输入文档转换为Unicode编码，输出文档转换为UTF-8编码，你不需要考虑编码方式，除非文档没有指定编码方式</p>
<h2 id="解析器"><a href="#解析器" class="headerlink" title="解析器"></a>解析器</h2><div class="table-container">
<table>
<thead>
<tr>
<th>解析器</th>
<th>使用方法</th>
<th>优势</th>
<th>劣势 </th>
</tr>
</thead>
<tbody>
<tr>
<td>Python标准库</td>
<td>BeautifulSoup(markup,"html.parser")</td>
<td>Python的内置标准库，执行速度适中，文档容错率强</td>
<td>Python2.7.3及Python3.2.2之前的版本容错率差</td>
</tr>
<tr>
<td>lxml HTML解析库</td>
<td>BeautifulSoup(markup,"lxml")</td>
<td>速度快，文档容错能力强</td>
<td>需要安装C语言库</td>
</tr>
<tr>
<td>lxml XML解析器</td>
<td>BeautifulSoup(markup,"xml")</td>
<td>速度快，唯一支持XML的解析器</td>
<td>需要安装C语言库</td>
</tr>
<tr>
<td>html5lib</td>
<td>BeautifulSoup(markup,"html5lib")</td>
<td>最好的容错性，以浏览器方式解析文档，生成HTML5格式的文档</td>
<td>速度慢，需要额外安装外部Python依赖（html5lib）</td>
</tr>
</tbody>
</table>
</div>
<h2 id="基本用法"><a href="#基本用法" class="headerlink" title="基本用法"></a>基本用法</h2><figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br><span class="line">51</span><br></pre></td><td class="code"><pre><span class="line">html = """</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span><span class="tag">&lt;<span class="name">head</span>&gt;</span><span 
class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span><span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"title"</span> <span class="attr">name</span>=<span class="string">"dromouse"</span>&gt;</span><span class="tag">&lt;<span class="name">b</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">b</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>Once upon a time there were three little sisters; and their names were</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span><span class="comment">&lt;!-- Elsie --&gt;</span><span class="tag">&lt;/<span class="name">a</span>&gt;</span>,</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/lacie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link2"</span>&gt;</span>Lacie<span class="tag">&lt;/<span class="name">a</span>&gt;</span> and</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/tillie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> 
<span class="attr">id</span>=<span class="string">"link3"</span>&gt;</span>Tillie<span class="tag">&lt;/<span class="name">a</span>&gt;</span>;</span><br><span class="line">and they lived at the bottom of a well.<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>...<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">"""</span><br><span class="line">from bs4 import BeautifulSoup</span><br><span class="line">soup = BeautifulSoup(html, 'lxml')</span><br><span class="line">print(soup.prettify())</span><br><span class="line">print(soup.title.string)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line"> <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">title</span>&gt;</span></span><br><span class="line">   The Dormouse's story</span><br><span class="line">  <span class="tag">&lt;/<span class="name">title</span>&gt;</span></span><br><span class="line"> <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line"> <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"title"</span> <span class="attr">name</span>=<span class="string">"dromouse"</span>&gt;</span></span><br><span class="line">   <span class="tag">&lt;<span class="name">b</span>&gt;</span></span><br><span class="line">    The Dormouse's story</span><br><span class="line">   <span class="tag">&lt;/<span class="name">b</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;/<span 
class="name">p</span>&gt;</span></span><br><span class="line">  <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line">   Once upon a time there were three little sisters; and their names were</span><br><span class="line">   <span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line">    <span class="comment">&lt;!-- Elsie --&gt;</span></span><br><span class="line">   <span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">   ,</span><br><span class="line">   <span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/lacie"</span> <span class="attr">id</span>=<span class="string">"link2"</span>&gt;</span></span><br><span class="line">    Lacie</span><br><span class="line">   <span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">   and</span><br><span class="line">   <span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/tillie"</span> <span class="attr">id</span>=<span class="string">"link3"</span>&gt;</span></span><br><span class="line">    Tillie</span><br><span class="line">   <span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">   ;</span><br><span class="line">and they lived at the bottom of a well.</span><br><span class="line">  <span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">  <span 
class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line">   ...</span><br><span class="line">  <span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"> <span class="tag">&lt;/<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">html</span>&gt;</span></span><br><span class="line">The Dormouse's story</span><br></pre></td></tr></table></figure>
<p>prettify()方法可以把要解析的字符串以标准的缩进格式输出。这里可以看到，输入的HTML字符串中body和html节点并没有闭合，而输出结果中包含了完整的body和html节点，说明Beautiful Soup可以自动更正不标准的HTML格式</p>
<p>soup.title可以选出HTML中的title节点，string属性可以直接得到里面的文本</p>
<h2 id="节点选择器"><a href="#节点选择器" class="headerlink" title="节点选择器"></a>节点选择器</h2><p>直接调用节点的名称就可以选择节点元素，再调用string属性就可以得到节点内的文本</p>
<h3 id="选择元素"><a href="#选择元素" class="headerlink" title="选择元素"></a>选择元素</h3><figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br></pre></td><td class="code"><pre><span class="line">html = """</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span><span class="tag">&lt;<span class="name">head</span>&gt;</span><span class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span><span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"title"</span> <span class="attr">name</span>=<span class="string">"dromouse"</span>&gt;</span><span class="tag">&lt;<span class="name">b</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">b</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span 
class="attr">class</span>=<span class="string">"story"</span>&gt;</span>Once upon a time there were three little sisters; and their names were</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span><span class="comment">&lt;!-- Elsie --&gt;</span><span class="tag">&lt;/<span class="name">a</span>&gt;</span>,</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/lacie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link2"</span>&gt;</span>Lacie<span class="tag">&lt;/<span class="name">a</span>&gt;</span> and</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/tillie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link3"</span>&gt;</span>Tillie<span class="tag">&lt;/<span class="name">a</span>&gt;</span>;</span><br><span class="line">and they lived at the bottom of a well.<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>...<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">"""</span><br><span class="line">from bs4 import BeautifulSoup</span><br><span class="line">soup = BeautifulSoup(html, 'lxml')</span><br><span class="line">print(soup.title)</span><br><span class="line">print(type(soup.title))</span><br><span 
class="line">print(soup.title.string)</span><br><span class="line">print(soup.head)</span><br><span class="line">print(soup.p)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line"><span class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">class</span> '<span class="attr">bs4.element.Tag</span>'&gt;</span></span><br><span class="line">The Dormouse's story</span><br><span class="line"><span class="tag">&lt;<span class="name">head</span>&gt;</span><span class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span><span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"title"</span> <span class="attr">name</span>=<span class="string">"dromouse"</span>&gt;</span><span class="tag">&lt;<span class="name">b</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">b</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br></pre></td></tr></table></figure>
<p>打印title节点的选择结果，输出结果就是title节点加里面的文字内容，类型是bs4.element.Tag类型，这是Beautiful Soup中一个重要的数据结构，经过选择器选择后结果都是这种Tag类型，Tag类型具有一些属性，例如string</p>
<p>我们还注意到，只输出了第一个p节点的内容，由此可以得出：这种选择方式只会选择到第一个匹配的节点</p>
<h3 id="提取信息"><a href="#提取信息" class="headerlink" title="提取信息"></a>提取信息</h3><p>使用string属性可以获取文本的值，节点属性的值有几种获取方法：</p>
<h4 id="获取名称"><a href="#获取名称" class="headerlink" title="获取名称"></a>获取名称</h4><p>使用name属性获取节点的名称，选取title节点，然后调用name属性就可以获得节点名称<br><figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre></td><td class="code"><pre><span class="line">html = """</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span><span class="tag">&lt;<span class="name">head</span>&gt;</span><span class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span><span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"title"</span> <span class="attr">name</span>=<span class="string">"dromouse"</span>&gt;</span><span class="tag">&lt;<span class="name">b</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">b</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>Once upon a time there were three little sisters; and their names were</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span 
class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span><span class="comment">&lt;!-- Elsie --&gt;</span><span class="tag">&lt;/<span class="name">a</span>&gt;</span>,</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/lacie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link2"</span>&gt;</span>Lacie<span class="tag">&lt;/<span class="name">a</span>&gt;</span> and</span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/tillie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link3"</span>&gt;</span>Tillie<span class="tag">&lt;/<span class="name">a</span>&gt;</span>;</span><br><span class="line">and they lived at the bottom of a well.<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>...<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">"""</span><br><span class="line">from bs4 import BeautifulSoup</span><br><span class="line">soup = BeautifulSoup(html, 'lxml')</span><br><span class="line">print(soup.title.name)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line">title</span><br></pre></td></tr></table></figure></p>
<h4 id="获取属性"><a href="#获取属性" class="headerlink" title="获取属性"></a>获取属性</h4><p>每个节点可能有多个属性，比如id和class等，选择这个节点元素后，调用attrs获取所有的属性<br><figure class="highlight dust"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br></pre></td><td class="code"><pre><span class="line"><span class="xml">html = """</span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">html</span>&gt;</span><span class="tag">&lt;<span class="name">head</span>&gt;</span><span class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span><span class="tag">&lt;/<span class="name">head</span>&gt;</span></span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">body</span>&gt;</span></span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"title"</span> <span class="attr">name</span>=<span class="string">"dromouse"</span>&gt;</span><span class="tag">&lt;<span class="name">b</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">b</span>&gt;</span><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span 
class="string">"story"</span>&gt;</span>Once upon a time there were three little sisters; and their names were</span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span><span class="comment">&lt;!-- Elsie --&gt;</span><span class="tag">&lt;/<span class="name">a</span>&gt;</span>,</span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/lacie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link2"</span>&gt;</span>Lacie<span class="tag">&lt;/<span class="name">a</span>&gt;</span> and</span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/tillie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link3"</span>&gt;</span>Tillie<span class="tag">&lt;/<span class="name">a</span>&gt;</span>;</span></span><br><span class="line"><span class="xml">and they lived at the bottom of a well.<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span></span><br><span class="line"><span class="xml"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>...<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span></span><br><span class="line"><span class="xml">"""</span></span><br><span class="line"><span class="xml">from bs4 import BeautifulSoup</span></span><br><span class="line"><span class="xml">soup = BeautifulSoup(html, 
'lxml')</span></span><br><span class="line"><span class="xml">print(soup.p.attrs)</span></span><br><span class="line"><span class="xml">print(soup.p.attrs['name'])</span></span><br><span class="line"></span><br><span class="line"><span class="xml">输出结果：</span></span><br><span class="line"><span class="template-variable">&#123;'class': ['title'], 'name': 'dromouse'&#125;</span></span><br><span class="line"><span class="xml">dromouse</span></span><br></pre></td></tr></table></figure><br>可以看到attrs返回结果是字典类型</p>
<p>还有一种更简单的获取方式，不用attrs<br><figure class="highlight stylus"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="title">print</span><span class="params">(soup.p[<span class="string">'name'</span>])</span></span></span><br><span class="line"><span class="function"><span class="title">print</span><span class="params">(soup.p[<span class="string">'class'</span>])</span></span></span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line">dromouse</span><br><span class="line">[<span class="string">'title'</span>]</span><br></pre></td></tr></table></figure><br>由于一个节点元素可有多个class,所以class返回的是列表</p>
<h3 id="嵌套选择"><a href="#嵌套选择" class="headerlink" title="嵌套选择"></a>嵌套选择</h3><p>我们如果想获取head节点元素中的title元素，就可以使用嵌套查询，很简单，就是在选中元素上再选中元素<br><figure class="highlight pgsql"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br></pre></td><td class="code"><pre><span class="line">html = """</span><br><span class="line">&lt;html&gt;&lt;head&gt;&lt;title&gt;The Dormouse's story&lt;/title&gt;&lt;/head&gt;</span><br><span class="line">&lt;body&gt;</span><br><span class="line">"""</span><br><span class="line"><span class="keyword">from</span> bs4 <span class="keyword">import</span> BeautifulSoup</span><br><span class="line">soup = BeautifulSoup(html, <span class="string">'lxml'</span>)</span><br><span class="line">print(soup.head.title)</span><br><span class="line">print(<span class="keyword">type</span>(soup.head.title))</span><br><span class="line">print(soup.head.title.string)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line">&lt;title&gt;The Dormous<span class="string">e's story&lt;/title&gt;</span></span><br><span class="line"><span class="string">&lt;class '</span>bs4.element.Tag<span class="string">'&gt;</span></span><br><span class="line"><span class="string">The Dormouse'</span>s story</span><br></pre></td></tr></table></figure></p>
<p>这样就实现了嵌套选择节点</p>
<h3 id="关联选择"><a href="#关联选择" class="headerlink" title="关联选择"></a>关联选择</h3><p>有时候不能一步选到想要的节点元素，需要先选中某一个节点元素，然后以它为基准再选择它的子节点，父节点，兄弟节点等</p>
<h4 id="子节点和子孙节点"><a href="#子节点和子孙节点" class="headerlink" title="子节点和子孙节点"></a>子节点和子孙节点</h4><p>选取节点元素后，想要获取它的子节点和子孙节点可以调用contents属性<br><figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br></pre></td><td class="code"><pre><span class="line">html = """</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line">            Once upon a time there were three little sisters; and their names were</span><br><span class="line">            <span 
class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line">                <span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line">            <span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">            <span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/lacie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link2"</span>&gt;</span>Lacie<span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">            and</span><br><span class="line">            <span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/tillie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link3"</span>&gt;</span>Tillie<span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">            and they lived at the bottom of a well.</span><br><span class="line">        <span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>...<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">"""</span><br><span class="line">from bs4 import BeautifulSoup</span><br><span class="line">soup=BeautifulSoup(html,'lxml')</span><br><span 
class="line">print(soup.p.contents)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line">['\n            Once upon a time there were three little sisters; and their names were\n            ', <span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span>, '\n', <span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/lacie"</span> <span class="attr">id</span>=<span class="string">"link2"</span>&gt;</span>Lacie<span class="tag">&lt;/<span class="name">a</span>&gt;</span>, '\n            and\n            ', <span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/tillie"</span> <span class="attr">id</span>=<span class="string">"link3"</span>&gt;</span>Tillie<span class="tag">&lt;/<span class="name">a</span>&gt;</span>, '\n            and they lived at the bottom of a well.\n        ']</span><br></pre></td></tr></table></figure><br>可以看到返回结果是列表形式，p节点中既包含文本，又包含节点，最后会将它们以列表形式统一返回。值得注意的是，列表中的每个元素都是p节点的直接子节点，contents属性返回的结果是直接子节点的列表</p>
<p>我们还可以使用children属性获得相应的结果<br><figure class="highlight livecodeserver"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br></pre></td><td class="code"><pre><span class="line"><span class="built_in">from</span> bs4 import BeautifulSoup</span><br><span class="line"></span><br><span class="line">soup = BeautifulSoup(html, <span class="string">'lxml'</span>)</span><br><span class="line">print(soup.p.children)</span><br><span class="line"><span class="keyword">for</span> i, child <span class="keyword">in</span> enumerate(soup.p.children):</span><br><span class="line">    print(i, child)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line">&lt;list_iterator object <span class="keyword">at</span> <span class="number">0x000002711FDF8520</span>&gt;</span><br><span class="line"><span class="number">0</span> </span><br><span class="line">            Once upon <span class="keyword">a</span> <span class="built_in">time</span> there were <span class="literal">three</span> little sisters; <span class="keyword">and</span> their names were</span><br><span class="line">            </span><br><span class="line"><span class="number">1</span> &lt;<span class="keyword">a</span> class=<span 
class="string">"sister"</span> href=<span class="string">"http://example.com/elsie"</span> id=<span class="string">"link1"</span>&gt;</span><br><span class="line">&lt;span&gt;Elsie&lt;/span&gt;</span><br><span class="line">&lt;/<span class="keyword">a</span>&gt;</span><br><span class="line"><span class="number">2</span> </span><br><span class="line"></span><br><span class="line"><span class="number">3</span> &lt;<span class="keyword">a</span> class=<span class="string">"sister"</span> href=<span class="string">"http://example.com/lacie"</span> id=<span class="string">"link2"</span>&gt;Lacie&lt;/<span class="keyword">a</span>&gt;</span><br><span class="line"><span class="number">4</span> </span><br><span class="line">            <span class="keyword">and</span></span><br><span class="line">            </span><br><span class="line"><span class="number">5</span> &lt;<span class="keyword">a</span> class=<span class="string">"sister"</span> href=<span class="string">"http://example.com/tillie"</span> id=<span class="string">"link3"</span>&gt;Tillie&lt;/<span class="keyword">a</span>&gt;</span><br><span class="line"><span class="number">6</span> </span><br><span class="line">            <span class="keyword">and</span> they lived <span class="keyword">at</span> <span class="keyword">the</span> bottom <span class="keyword">of</span> <span class="keyword">a</span> well.</span><br></pre></td></tr></table></figure><br>同样是HTML文本，children属性来选择，返回结果是生成器类型</p>
<p>如果想要得到所有的子孙节点，可以调用descendants属性：<br><figure class="highlight livecodeserver"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br></pre></td><td class="code"><pre><span class="line"></span><br><span class="line"><span class="built_in">from</span> bs4 import BeautifulSoup</span><br><span class="line"></span><br><span class="line">soup = BeautifulSoup(html, <span class="string">'lxml'</span>)</span><br><span class="line">print(soup.p.descendants)</span><br><span class="line"><span class="keyword">for</span> i, child <span class="keyword">in</span> enumerate(soup.p.descendants):</span><br><span class="line">    print(i, child)</span><br><span class="line">    </span><br><span class="line">输出结果：</span><br><span class="line">&lt;generator object Tag.descendants <span class="keyword">at</span> <span class="number">0x000002005FC8AC10</span>&gt;</span><br><span class="line"><span class="number">0</span> </span><br><span class="line">            Once upon <span 
class="keyword">a</span> <span class="built_in">time</span> there were <span class="literal">three</span> little sisters; <span class="keyword">and</span> their names were</span><br><span class="line">            </span><br><span class="line"><span class="number">1</span> &lt;<span class="keyword">a</span> class=<span class="string">"sister"</span> href=<span class="string">"http://example.com/elsie"</span> id=<span class="string">"link1"</span>&gt;</span><br><span class="line">&lt;span&gt;Elsie&lt;/span&gt;</span><br><span class="line">&lt;/<span class="keyword">a</span>&gt;</span><br><span class="line"><span class="number">2</span> </span><br><span class="line"></span><br><span class="line"><span class="number">3</span> &lt;span&gt;Elsie&lt;/span&gt;</span><br><span class="line"><span class="number">4</span> Elsie</span><br><span class="line"><span class="number">5</span> </span><br><span class="line"></span><br><span class="line"><span class="number">6</span> </span><br><span class="line"></span><br><span class="line"><span class="number">7</span> &lt;<span class="keyword">a</span> class=<span class="string">"sister"</span> href=<span class="string">"http://example.com/lacie"</span> id=<span class="string">"link2"</span>&gt;Lacie&lt;/<span class="keyword">a</span>&gt;</span><br><span class="line"><span class="number">8</span> Lacie</span><br><span class="line"><span class="number">9</span> </span><br><span class="line">            <span class="keyword">and</span></span><br><span class="line">            </span><br><span class="line"><span class="number">10</span> &lt;<span class="keyword">a</span> class=<span class="string">"sister"</span> href=<span class="string">"http://example.com/tillie"</span> id=<span class="string">"link3"</span>&gt;Tillie&lt;/<span class="keyword">a</span>&gt;</span><br><span class="line"><span class="number">11</span> Tillie</span><br><span class="line"><span class="number">12</span> </span><br><span class="line">            <span 
class="keyword">and</span> they lived <span class="keyword">at</span> <span class="keyword">the</span> bottom <span class="keyword">of</span> <span class="keyword">a</span> well.</span><br></pre></td></tr></table></figure><br>我们可以看到这次输出结果就包含了span节点，也就是输出了所有子节点，包括子孙节点</p>
<h4 id="父节点和祖先节点"><a href="#父节点和祖先节点" class="headerlink" title="父节点和祖先节点"></a>父节点和祖先节点</h4><p>想要获得某个节点元素的父节点，可以调用parent属性：<br><figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br></pre></td><td class="code"><pre><span class="line">html = """</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">head</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">title</span>&gt;</span>The Dormouse's story<span class="tag">&lt;/<span class="name">title</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;/<span class="name">head</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line">            Once upon a time there were three little sisters; and their names were</span><br><span class="line">            <span class="tag">&lt;<span class="name">a</span> <span 
class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line">                <span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line">            <span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span>...<span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">"""</span><br><span class="line">from bs4 import BeautifulSoup</span><br><span class="line">soup = BeautifulSoup(html, 'lxml')</span><br><span class="line">print(soup.a.parent)</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line">            Once upon a time there were three little sisters; and their names were</span><br><span class="line">            <span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line"><span 
class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br></pre></td></tr></table></figure><br>我们选择的是第一个a标签的父节点元素，所以就是p节点，返回的内容便是p节点中所有的内容</p>
<p>如果我们想获取所有的祖先节点，可以调用parents属性<br><figure class="highlight xml"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br></pre></td><td class="code"><pre><span class="line">html = """</span><br><span class="line"><span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line">    <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line">            <span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> 
<span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line">                <span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line">            <span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line">        <span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line">"""</span><br><span class="line">from bs4 import BeautifulSoup</span><br><span class="line">soup = BeautifulSoup(html, 'lxml')</span><br><span class="line">print(type(soup.a.parents))</span><br><span class="line">print(list(enumerate(soup.a.parents)))</span><br><span class="line"></span><br><span class="line">输出结果：</span><br><span class="line"><span class="tag">&lt;<span class="name">class</span> '<span class="attr">generator</span>'&gt;</span></span><br><span class="line">[(0, <span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">p</span>&gt;</span>), (1, <span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span 
class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">body</span>&gt;</span>), (2, <span class="tag">&lt;<span class="name">html</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">body</span>&gt;</span><span class="tag">&lt;/<span class="name">html</span>&gt;</span>), (3, <span class="tag">&lt;<span 
class="name">html</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">body</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">p</span> <span class="attr">class</span>=<span class="string">"story"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">class</span>=<span class="string">"sister"</span> <span class="attr">href</span>=<span class="string">"http://example.com/elsie"</span> <span class="attr">id</span>=<span class="string">"link1"</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;<span class="name">span</span>&gt;</span>Elsie<span class="tag">&lt;/<span class="name">span</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">p</span>&gt;</span></span><br><span class="line"><span class="tag">&lt;/<span class="name">body</span>&gt;</span><span class="tag">&lt;/<span class="name">html</span>&gt;</span>)]</span><br></pre></td></tr></table></figure><br>我们可以看到，使用了列表和枚举类型来输出，输出的第一个元素是p标签所有的内容，然后是body，也就是p标签的父节点，之后是html节点也就是body的父节点</p>
<h1 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h1><p>1.XPath中/为访问直接子节点，//为访问所有子孙节点</p>
<p>2.在查询子节点的时候，虽然可以使用/，但是不推荐，因为如果没有直接子节点就不会返回内容</p>
<p>3.在使用text()方法的时候，推荐使用//，这样可以确保查找到所有子节点，而不会导致没有直接子节点从而没有返回结果的情况</p>
<p>4.属性匹配是用中括号加属性名和值来限定某个属性，如[@href="link1.html"]，而属性获取时的/@href指的是获取节点的某个属性，二者需要做好区分</p>
<p>5.Beautiful Soup比XPath更加方便快捷，它们两者的使用方法很类似，XPath用//代表访问子孙节点，而BS使用descendants属性，使用更加方便，便于理解</p>
<p>6.children属性只能获取直接子节点，和XPath中的/类似</p>
<p>7.XPath中可以使用..或parent::获取父节点，使用ancestor::获取祖先节点；BS中可以使用parent和parents属性来获取父节点和祖先节点</p>
<p>8.注意BS在使用parent或parents属性时，输出的内容是由内到外遍历，也就是从小到大依次输出该节点中的所有内容</p>
<p>9.如果某个节点中包含了子节点，那么这个子节点中的文本内容在输出父节点时，也属于节点，会随着一起输出</p>

      
    </div>
    
    
    

    

    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/CTF/" rel="tag"># CTF</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2020/03/21/Python3-2020-3-22-%E5%91%A8%E6%8A%A54/" rel="next" title="Python3-2020/3/22-周报4">
                <i class="fa fa-chevron-left"></i> Python3-2020/3/22-周报4
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2020/04/04/Python3-2020-4-4-%E5%91%A8%E6%8A%A56/" rel="prev" title="Python3-2020/4/4-周报6">
                Python3-2020/4/4-周报6 <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            Table of Contents
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            Overview
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <img class="site-author-image" itemprop="image"
                src="/images/header.jpg"
                alt="51nd0re1" />
            
              <p class="site-author-name" itemprop="name">51nd0re1</p>
              <p class="site-description motion-element" itemprop="description"></p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/archives">
              
                  <span class="site-state-item-count">43</span>
                  <span class="site-state-item-name">posts</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-categories">
                <a href="/categories/index.html">
                  <span class="site-state-item-count">4</span>
                  <span class="site-state-item-name">categories</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-tags">
                <a href="/tags/index.html">
                  <span class="site-state-item-count">2</span>
                  <span class="site-state-item-name">tags</span>
                </a>
              </div>
            

          </nav>

          
            <div class="feed-link motion-element">
              <a href="/rss2.xml" rel="alternate">
                <i class="fa fa-rss"></i>
                RSS
              </a>
            </div>
          

          
            <div class="links-of-author motion-element">
                
                  <span class="links-of-author-item">
                    <a href="https://github.com/diao-diao" target="_blank" title="GitHub">
                      
                        <i class="fa fa-fw fa-github"></i>GitHub</a>
                  </span>
                
                  <span class="links-of-author-item">
                    <a href="mailto:302684932@qq.com" target="_blank" title="E-Mail">
                      
                        <i class="fa fa-fw fa-envelope"></i>E-Mail</a>
                  </span>
                
            </div>
          

          
          

          
          
            <div class="links-of-blogroll motion-element links-of-blogroll-block">
              <div class="links-of-blogroll-title">
                <i class="fa  fa-fw fa-link"></i>
                友情链接
              </div>
              <ul class="links-of-blogroll-list">
                
                  <li class="links-of-blogroll-item">
                    <a href="https://l0x1c.github.io/" title="L0x1c" target="_blank">L0x1c</a>
                  </li>
                
                  <li class="links-of-blogroll-item">
                    <a href="https://15h3na0.xyz/" title="15h3na0" target="_blank">15h3na0</a>
                  </li>
                
                  <li class="links-of-blogroll-item">
                    <a href="https://p3rh4ps.top/" title="p3rh4ps" target="_blank">p3rh4ps</a>
                  </li>
                
                  <li class="links-of-blogroll-item">
                    <a href="https://www.cnblogs.com/yesec/" title="yesec" target="_blank">yesec</a>
                  </li>
                
                  <li class="links-of-blogroll-item">
                    <a href="https://ch4ser-go.github.io/" title="ch4ser" target="_blank">ch4ser</a>
                  </li>
                
                  <li class="links-of-blogroll-item">
                    <a href="http://www.ggb0n.cool/" title="ggb0n" target="_blank">ggb0n</a>
                  </li>
                
                  <li class="links-of-blogroll-item">
                    <a href="http://github.mrkaixin.computer/" title="mrkaixin" target="_blank">mrkaixin</a>
                  </li>
                
                  <li class="links-of-blogroll-item">
                    <a href="https://imagin.vip/" title="imagin" target="_blank">imagin</a>
                  </li>
                
              </ul>
            </div>
          

          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#XPath"><span class="nav-number">1.</span> <span class="nav-text">XPath</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#常用规则"><span class="nav-number">1.1.</span> <span class="nav-text">常用规则</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#实例"><span class="nav-number">1.2.</span> <span class="nav-text">实例</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#所有节点"><span class="nav-number">1.3.</span> <span class="nav-text">所有节点</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#子节点"><span class="nav-number">1.4.</span> <span class="nav-text">子节点</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#父节点"><span class="nav-number">1.5.</span> <span class="nav-text">父节点</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#属性匹配"><span class="nav-number">1.6.</span> <span class="nav-text">属性匹配</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#文本获取"><span class="nav-number">1.7.</span> <span class="nav-text">文本获取</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#属性获取"><span class="nav-number">1.8.</span> <span class="nav-text">属性获取</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#属性多值匹配"><span class="nav-number">1.9.</span> <span class="nav-text">属性多值匹配</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#多属性匹配"><span class="nav-number">1.10.</span> <span class="nav-text">多属性匹配</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#Beautiful-Soup"><span class="nav-number">2.</span> <span class="nav-text">Beautiful Soup</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#解析器"><span class="nav-number">2.1.</span> <span 
class="nav-text">解析器</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#基本用法"><span class="nav-number">2.2.</span> <span class="nav-text">基本用法</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#节点选择器"><span class="nav-number">2.3.</span> <span class="nav-text">节点选择器</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#选择元素"><span class="nav-number">2.3.1.</span> <span class="nav-text">选择元素</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#提取信息"><span class="nav-number">2.3.2.</span> <span class="nav-text">提取信息</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#获取名称"><span class="nav-number">2.3.2.1.</span> <span class="nav-text">获取名称</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#获取属性"><span class="nav-number">2.3.2.2.</span> <span class="nav-text">获取属性</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#嵌套选择"><span class="nav-number">2.3.3.</span> <span class="nav-text">嵌套选择</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#关联选择"><span class="nav-number">2.3.4.</span> <span class="nav-text">关联选择</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#子节点和子孙节点"><span class="nav-number">2.3.4.1.</span> <span class="nav-text">子节点和子孙节点</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#父节点和祖先节点"><span class="nav-number">2.3.4.2.</span> <span class="nav-text">父节点和祖先节点</span></a></li></ol></li></ol></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#总结"><span class="nav-number">3.</span> <span class="nav-text">总结</span></a></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; <span itemprop="copyrightYear">2020</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">51nd0re1</span>

  
</div>


  <div class="powered-by">Powered by <a class="theme-link" target="_blank" href="https://hexo.io">Hexo</a></div>



  <span class="post-meta-divider">|</span>



  <div class="theme-info">Theme &mdash; <a class="theme-link" target="_blank" href="https://github.com/iissnan/hexo-theme-next">NexT.Gemini</a> v5.1.4</div>




        







        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

  </div>

  

<script type="text/javascript">
  // Keep window.Promise only when its type tag is '[object Function]';
  // any other value (including a missing Promise) is replaced with null.
  (function (root) {
    var promiseTag = Object.prototype.toString.call(root.Promise);
    if (promiseTag !== '[object Function]') {
      root.Promise = null;
    }
  })(window);
</script>









  


  











  
  
    <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  

  
  
    <script type="text/javascript" src="/lib/canvas-nest/canvas-nest.min.js"></script>
  


  


  <script type="text/javascript" src="/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/js/src/motion.js?v=5.1.4"></script>



  
  


  <script type="text/javascript" src="/js/src/affix.js?v=5.1.4"></script>

  <script type="text/javascript" src="/js/src/schemes/pisces.js?v=5.1.4"></script>



  
  <script type="text/javascript" src="/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  












  

  <script type="text/javascript">
    // Local-search state shared by the functions below.
    // isfetched flips to true once the search index request has succeeded.
    var isfetched = false;
    // File name of the search index; an empty value falls back to the default.
    var search_path = "search.xml";
    if (search_path.length === 0) {
      search_path = "search.xml";
    }
    // The index is treated as XML unless its name ends in "json" (any case).
    var isXml = !/json$/i.test(search_path);
    // URL requested for the index: the file name prefixed with "/".
    var path = "/" + search_path;
    // monitor main search box;

    // Close the search popup: hide it, empty the query box, remove the
    // rendered results, the "no result" node and the overlay, then clear the
    // inline overflow value that was set on <body>.
    var onPopupClose = function (e) {
      $('.popup').hide();
      $('#local-search-input').val('');
      // Removed one selector at a time, in the same order as before.
      ['.search-result-list', '#no-result', '.local-search-pop-overlay'].forEach(function (selector) {
        $(selector).remove();
      });
      $('body').css('overflow', '');
    };

    // Show the search popup: add the overlay to <body>, set its overflow to
    // hidden, close on overlay click, toggle the popup and focus the query box.
    function proceedsearch() {
      var $body = $("body");
      $body.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
      $body.css('overflow', 'hidden');

      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();

      // Set autocapitalize/autocorrect on the query box before focusing it.
      $('#local-search-input')
        .attr("autocapitalize", "none")
        .attr("autocorrect", "off")
        .focus();
    }

    /**
     * Fetch the search database, wire the query input to the local search and
     * open the search popup.
     *
     * While the request is in flight a loading overlay is appended to <body>
     * and body scrolling is disabled; both are undone when the request ends,
     * whether it succeeded or failed.
     *
     * @param {string} path       URL the search database is requested from
     *                            (parsed as XML or JSON depending on `isXml`).
     * @param {string} search_id  id of the input element holding the query.
     * @param {string} content_id id of the element that receives the results.
     */
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Remove the loading overlay and restore the body's overflow.
      function removeLoadingOverlay() {
        $(".local-search-pop-overlay").remove();
        $('body').css('overflow', '');
      }

      // Escape a string so it is safe inside a quoted HTML attribute value.
      function escapeAttribute(value) {
        return String(value)
          .replace(/&/g, '&amp;')
          .replace(/</g, '&lt;')
          .replace(/>/g, '&gt;')
          .replace(/"/g, '&quot;')
          .replace(/'/g, '&#39;');
      }

      // Percent-decode a URL; a malformed escape sequence makes
      // decodeURIComponent throw, in which case the raw value is kept so one
      // bad entry cannot break the whole search.
      function decodeUrl(url) {
        try {
          return decodeURIComponent(url);
        } catch (e) {
          return String(url);
        }
      }

      // Collect every occurrence of `word` in `text` as {position, word}.
      // Matching is case-insensitive unless `caseSensitive` is truthy.
      function getIndexByWord(word, text, caseSensitive) {
        var wordLen = word.length;
        if (wordLen === 0) {
          return [];
        }
        var startPosition = 0, position, index = [];
        if (!caseSensitive) {
          text = text.toLowerCase();
          word = word.toLowerCase();
        }
        while ((position = text.indexOf(word, startPosition)) > -1) {
          index.push({position: position, word: word});
          startPosition = position + wordLen;
        }
        return index;
      }

      // Render text[slice.start, slice.end) with every hit wrapped in
      // <b class="search-keyword">.
      function highlightKeyword(text, slice) {
        var result = '';
        var prevEnd = slice.start;
        slice.hits.forEach(function (hit) {
          result += text.substring(prevEnd, hit.position);
          var end = hit.position + hit.length;
          result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
          prevEnd = end;
        });
        result += text.substring(prevEnd, slice.end);
        return result;
      }

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        error: function() {
          // Without this the loading overlay (which has no click handler) would
          // stay on screen with scrolling disabled. `isfetched` stays false,
          // so the next click on the trigger retries the download.
          removeLoadingOverlay();
          if (window.console && console.error) {
            console.error('Local search: failed to load ' + path);
          }
        },
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Re-run the search for the current query and re-render the results.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              // The full phrase counts as a keyword too.
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var searchTextCount = 0;
                // Tolerate entries that lack a title or content field.
                var title = String(data.title || '').trim();
                var titleInLowerCase = title.toLowerCase();
                var content = String(data.content || '').trim().replace(/<[^>]+>/g,"");
                var contentInLowerCase = content.toLowerCase();
                // Escaped because it is concatenated into href='…' below.
                var articleUrl = escapeAttribute(decodeUrl(data.url));
                var indexOfTitle = [];
                var indexOfContent = [];

                // Consume hits from the end of `index` (sorted so the smallest
                // position is last) that fit entirely before `end`, dropping
                // hits that overlap one already taken. Adds the number of
                // full-phrase hits to this entry's searchTextCount.
                function mergeIntoSlice(text, start, end, index) {
                  var item = index[index.length - 1];
                  var position = item.position;
                  var word = item.word;
                  var hits = [];
                  var searchTextCountInSlice = 0;
                  while (position + word.length <= end && index.length != 0) {
                    if (word === searchText) {
                      searchTextCountInSlice++;
                    }
                    hits.push({position: position, length: word.length});
                    var wordEnd = position + word.length;

                    // move to next position of hit
                    index.pop();
                    while (index.length != 0) {
                      item = index[index.length - 1];
                      position = item.position;
                      word = item.word;
                      if (wordEnd > position) {
                        index.pop();
                      } else {
                        break;
                      }
                    }
                  }
                  searchTextCount += searchTextCountInSlice;
                  return {
                    hits: hits,
                    start: start,
                    end: end,
                    searchTextCount: searchTextCountInSlice
                  };
                }

                // only match articles with not empty titles
                if (title != '') {
                  keywords.forEach(function(keyword) {
                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                }

                // Taken before the index arrays are consumed below.
                var hitCount = indexOfTitle.length + indexOfContent.length;
                if (hitCount === 0) {
                  return;
                }

                // Sort by descending position (shorter word first on ties) so
                // that pop() yields hits in ascending position order.
                [indexOfTitle, indexOfContent].forEach(function (index) {
                  index.sort(function (itemLeft, itemRight) {
                    if (itemRight.position !== itemLeft.position) {
                      return itemRight.position - itemLeft.position;
                    } else {
                      return itemLeft.word.length - itemRight.word.length;
                    }
                  });
                });

                // merge hits into slices
                var slicesOfTitle = [];
                if (indexOfTitle.length != 0) {
                  slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
                }

                var slicesOfContent = [];
                while (indexOfContent.length != 0) {
                  var item = indexOfContent[indexOfContent.length - 1];
                  var position = item.position;
                  var word = item.word;
                  // cut out 100 characters
                  var start = position - 20;
                  var end = position + 80;
                  if (start < 0) {
                    start = 0;
                  }
                  if (end < position + word.length) {
                    end = position + word.length;
                  }
                  if (end > content.length) {
                    end = content.length;
                  }
                  var slice = mergeIntoSlice(content, start, end, indexOfContent);
                  if (slice.hits.length === 0) {
                    // Positions come from the lower-cased text, whose length
                    // can differ from the original's (e.g. U+0130). A hit past
                    // the end of `content` is never consumed by mergeIntoSlice,
                    // so drop it here to keep this loop from spinning forever.
                    indexOfContent.pop();
                    continue;
                  }
                  slicesOfContent.push(slice);
                }

                // sort slices in content by search text's count and hits' count
                slicesOfContent.sort(function (sliceLeft, sliceRight) {
                  if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                    return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                  } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                    return sliceRight.hits.length - sliceLeft.hits.length;
                  } else {
                    return sliceLeft.start - sliceRight.start;
                  }
                });

                // select top N slices in content
                var upperBound = parseInt('1', 10);
                if (upperBound >= 0) {
                  slicesOfContent = slicesOfContent.slice(0, upperBound);
                }

                // highlight title and content
                var resultItem = '';

                if (slicesOfTitle.length != 0) {
                  resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
                } else {
                  resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
                }

                slicesOfContent.forEach(function (slice) {
                  resultItem += "<a href='" + articleUrl + "'>" +
                    "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                    "...</p>" + "</a>";
                });

                resultItem += "</li>";
                resultItems.push({
                  item: resultItem,
                  searchTextCount: searchTextCount,
                  hitCount: hitCount,
                  id: resultItems.length
                });
              });
            }

            if (keywords.length === 1 && keywords[0] === "") {
              // Empty query: show the search icon placeholder.
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x" /></div>';
            } else if (resultItems.length === 0) {
              // Nothing matched.
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x" /></div>';
            } else {
              // Order by full-phrase hits, then total hits, then later entries first.
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          // Literal comparison kept as-is: 'auto' searches on every input
          // event, any other value searches on icon click or Enter.
          if ('auto' === 'auto') {
            input.addEventListener('input', inputEventFunction);
          } else {
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          removeLoadingOverlay();

          proceedsearch();
        }
      });
    };

    // Popup wiring: open on trigger click, close on close button or Esc.
    $('.popup-trigger').on('click', function (evt) {
      evt.stopPropagation();
      if (isfetched !== false) {
        // The database is already loaded, so just show the popup.
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    $('.popup-btn-close').on('click', onPopupClose);

    // Stop clicks inside the popup from propagating to ancestor elements.
    $('.popup').on('click', function (evt) {
      evt.stopPropagation();
    });

    // Esc (key code 27) closes the search popup while it is visible.
    $(document).on('keyup', function (event) {
      if (event.which !== 27) {
        return;
      }
      if ($('.search-popup').is(':visible')) {
        onPopupClose();
      }
    });
  </script>





  

  

  

  
  

  
  


  

  

  
  <!-- Little red hearts on page click -->
<script type="text/javascript" src="/js/src/clicklove.js"></script>

</body>
</html>
